# iris: numerical summaries of the built-in iris data set.
# (The stray `iris.` token that was fused onto the first comment is folded into
# this header; evaluated as code it is a reference to an undefined object.)

# quantitative variable
mean(iris$Sepal.Length)
median(iris$Sepal.Length)
summary(iris$Sepal.Length)  # minimum, quartiles, median, mean and maximum
# qualitative variable
table(iris$Species)  # number of rows for each species
# qualitative variable + quantitative variable
table(iris$Species, iris$Sepal.Length)  # species cross-tabulated by sepal length value
# import from a url.
road <- read.csv("http://www.maths.usyd.edu.au/u/UG/JM/DATA1001/r/current/data/2016Fatalities.csv")
# import data from a folder
# getwd() # this checks the working directory
# The path is relative to the working directory; a leading "/" would point at
# the root of the file system instead of the project's data folder.
road1 <- read.csv("data/2016Fatalities.csv")
You are beginning to learn how R classifies variables, and which graphical summary suits each type of data:
| Type of Variable | Referred to in R |
|---|---|
| Qualitative / Categorical | factor |
| Quantitative / Numerical | num |
| Type of Data | Type of Graphical Summary | In base R |
|---|---|---|
| 1 Qualitative Variable | Barplot | barplot() |
| 2 Qualitative Variables | Double (clustered) Barplot | barplot() |
| 1 Quantitative Variable | Histogram or Boxplot | hist(), boxplot() |
| 2 Quantitative Variables | Scatterplot | plot() |
| 1 Quantitative & 1 Qualitative Variable | Double (comparative) boxplot | boxplot() |
Now you’re ready to try some interesting data! Don’t get bamboozled by all the code, rather see what everything does!
Consider the Australian road fatalities from 1989 (a bigger version of the data used in Week 2 lectures). The data is sourced from BITRE.
# Read data from url into R.
# stringsAsFactors = TRUE keeps text columns as factors on every R version
# (read.csv stopped doing this by default in R 4.0.0), so class() and str()
# report "factor" as shown in this worksheet's output.
road <- read.csv(
  "http://www.maths.usyd.edu.au/u/UG/JM/DATA1001/r/current/data/AllFatalities.csv",
  stringsAsFactors = TRUE
)
Note: An alternative way is to download the data from Canvas, store the data in DATA1001files/data and upload from there. You will need to use this method in future projects, when you upload your own data.
# Read data from a local folder into R (the path is relative to the working directory).
# stringsAsFactors = TRUE keeps text columns as factors on every R version
# (read.csv stopped doing this by default in R 4.0.0), so class() and str()
# report "factor" as shown in this worksheet's output.
road <- read.csv("data/AllFatalities.csv", header = TRUE, stringsAsFactors = TRUE)
str(road)  # compact overview: dimensions, column names and column classes
Here we consider 1 qualitative variable: the road fatalities across the days of the week.
1st isolate the variable Dayweek. Check how R classifies it. Produce a barplot. What is annoying about it?
# How does R store the day-of-week column?
class(road[["Dayweek"]])
## [1] "factor"
# Bar chart of the number of rows recorded for each day of the week
barplot(table(road[["Dayweek"]]))
# Reorder the levels of Dayweek and then produce a barplot. What pattern emerges? Suggest possible reasons for it?
# (This prose had been fused onto the code line, which made the statement
# unparseable; its opening words were cut off and are reconstructed here.)
orderdayweek <- ordered(
  road$Dayweek,
  levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
)
barplot(table(orderdayweek))
# las = 2 draws the axis labels perpendicular to the axis so the day names fit
barplot(table(orderdayweek), las = 2)
Here we consider 2 qualitative variables: the road fatalities across the days of the week, cross-classified by bus involvement.
Is there any pattern?
# Cross-tabulate bus involvement (rows) against day of the week (columns)
road1 <- table(road$Bus_Involvement, road$Dayweek)
road1
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## No 7520 5216 8496 7382 6155 5258 5727
## Yes 178 125 108 80 129 123 127
# One bar per day, stacked by the rows of the table.
# `legend.text` is spelled out in full; the abbreviated `legend =` only worked
# through partial argument matching.
barplot(
  road1,
  main = "Fatalities by Day of the Week and Bus Involvement",
  xlab = "Day of the week",
  col = c("lightblue", "lightgreen"),
  legend.text = rownames(road1)
)
Here we consider 2 qualitative variables: the road fatalities across the days of the week, cross-classified by heavy rigid truck involvement.
Investigate whether the involvement of heavy rigid trucks differs across the days?
# Cross-tabulate heavy rigid truck involvement (rows) against day of the week (columns)
road2 <- table(road$Hvy_Rigid_Truck_Involvement, road$Dayweek)
road2
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## -9 4454 3080 4968 4299 3569 3042 3353
## No 3019 2096 3505 3091 2496 2164 2283
## Yes 225 165 131 72 219 175 218
# NOTE(review): the "-9" row looks like a code for missing/unknown values - confirm
# against the data dictionary before interpreting it.
# One bar per day, stacked by the rows of the table.
# `legend.text` is spelled out in full; the abbreviated `legend =` only worked
# through partial argument matching.
barplot(
  road2,
  main = "Fatalities by Day of the Week and Heavy Rigid Involvement",
  xlab = "Day of the week",
  col = c("lightpink", "lightblue", "lightgreen"),
  legend.text = rownames(road2)
)
Here we consider 1 quantitative variable: the ages of the road fatalities.
1st isolate the variable Age. How does R classify it?
class(road$Age)
## [1] "factor"
# Go through as.character() first: as.numeric() applied directly to a factor
# returns the internal level codes rather than the ages. Entries that are not
# numbers become NA, which is what the warning below reports.
road$Age <- as.numeric(as.character(road$Age))
## Warning: NAs introduced by coercion
class(road$Age)
## [1] "numeric"
# Density-scaled histogram and a default boxplot.
# `probability = TRUE` replaces `prob=T`: the full argument name avoids partial
# matching, and TRUE cannot be reassigned the way T can.
hist(road$Age, probability = TRUE)
boxplot(road$Age)
# The same plots, customised with a title, axis label, colours and orientation
hist(road$Age, freq = FALSE, main = "Histogram", ylab = "Probabilities", col = "green")
boxplot(road$Age, horizontal = TRUE, col = "red")
Here we consider 1 quantitative variable divided by 1 qualitative variable.
Control for biological sex, i.e. consider the ages of the fatalities separately for each biological sex.
# Ages split by the Gender column
ageF <- road$Age[road$Gender == "Female"]
ageM <- road$Age[road$Gender == "Male"]
# Stack the two boxplots vertically; par() returns the previous settings, which
# are kept so the layout can be restored afterwards.
old_par <- par(mfrow = c(2, 1))
boxplot(ageF, horizontal = TRUE, col = "light blue")
boxplot(ageM, horizontal = TRUE)
# The same comparison with the two boxplots side by side
par(mfrow = c(1, 2))
boxplot(ageF, horizontal = TRUE, col = "light blue")
boxplot(ageM, horizontal = TRUE)
# Restore the original layout so later base plots are not squeezed into half a device
par(old_par)
Explore another variable.
1st, read through this Overview and re-read RGuide Chapter 5.
2nd, load the package ggplot2 or tidyverse (which includes ggplot2).
# Start again with the raw data frame.
# stringsAsFactors = TRUE keeps text columns as factors on every R version
# (read.csv stopped doing this by default in R 4.0.0), so str() below reports
# Factor columns as shown in the recorded output.
road1 <- read.csv(
  "http://www.maths.usyd.edu.au/u/UG/JM/DATA1001/r/current/data/AllFatalities.csv",
  stringsAsFactors = TRUE
)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## -- Conflicts ---------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
str(road1)  # compact overview: dimensions, column names and column classes
## 'data.frame': 46624 obs. of 18 variables:
## $ CrashID : num 4.2e+12 2.2e+12 1.2e+12 5.2e+12 6.2e+12 ...
## $ State : Factor w/ 8 levels "ACT","NSW","NT",..: 5 7 2 8 6 6 6 4 7 8 ...
## $ Date : Factor w/ 9739 levels "01-Apr-00","01-Apr-01",..: 125 125 444 444 444 444 760 760 1080 1080 ...
## $ Day : int 1 1 2 2 2 2 3 3 4 4 ...
## $ Month : Factor w/ 12 levels "April","August",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ Dayweek : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
## $ Time : Factor w/ 1385 levels "0:00","0:01",..: 59 795 30 553 710 710 231 354 712 461 ...
## $ Hour : int 1 20 0 17 19 19 11 14 2 15 ...
## $ Minute : int 0 30 30 20 58 58 55 0 0 47 ...
## $ Crash_Type : Factor w/ 3 levels "Multiple","Pedestrian",..: 3 3 3 1 1 1 1 2 3 3 ...
## $ Bus_Involvement : Factor w/ 2 levels "No ","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hvy_Rigid_Truck_Involvement : Factor w/ 3 levels "-9","No ","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Articulated_Truck_Involvement: Factor w/ 2 levels "No ","Yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ Speed_Limit : Factor w/ 19 levels "10","100","110",..: 3 15 2 3 15 15 2 11 10 3 ...
## $ Road_User : Factor w/ 8 levels "-9","Bicyclist",..: 3 5 7 3 5 4 5 8 3 5 ...
## $ Gender : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Age : Factor w/ 102 levels "0","1","10","100",..: 37 20 12 51 11 27 49 70 62 66 ...
Redo the Road Fatalities exercises using ggplot.
# Base plot object: Dayweek on the x axis (1 variable)
p <- ggplot(data = road1, mapping = aes(x = Dayweek))
p + geom_bar()
# Put the days in calendar order, then rebuild the plot object from the updated data frame
road1$Dayweek <- factor(
  road1$Dayweek,
  levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
)
p <- ggplot(data = road1, mapping = aes(x = Dayweek))
p + geom_bar()
# Colour the bars by bus involvement
p + geom_bar(mapping = aes(fill = Bus_Involvement))
Here we consider 2 qualitative variables: the road fatalities across the days of the week, cross-classified by heavy rigid truck involvement.
Investigate whether the involvement of heavy rigid trucks differs across the days?
# Bars per day of the week, coloured by heavy rigid truck involvement
p + geom_bar(mapping = aes(fill = Hvy_Rigid_Truck_Involvement))
# Change classification of Age variable (factor -> numeric).
# The detour through as.character() converts the level labels themselves;
# labels that are not numbers become NA (hence the warning below).
class(road1[["Age"]])
## [1] "factor"
road1$Age <- as.numeric(as.character(road1[["Age"]]))
## Warning: NAs introduced by coercion
class(road1[["Age"]])
## [1] "numeric"
# Histogram of Age
p1 <- ggplot(data = road1, mapping = aes(x = Age))
p1 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 82 rows containing non-finite values (stat_bin).
# Boxplot of Age
# A single boxplot has no grouping variable, so x is mapped to an empty string.
p2 <- ggplot(data = road1, mapping = aes(x = "", y = Age))
p2 + geom_boxplot()
## Warning: Removed 82 rows containing non-finite values (stat_boxplot).
Here we consider 1 quantitative variable divided by 1 qualitative variable.
Control for biological sex, i.e. consider the ages of the fatalities separately for each biological sex.
# One boxplot of Age for each level of Gender
p3 <- ggplot(data = road1, mapping = aes(x = Gender, y = Age))
p3 + geom_boxplot()
## Warning: Removed 82 rows containing non-finite values (stat_boxplot).
Using ggplot, create a barplot of crashes with month on the x axis (hint: you’ll have to reorder the Month variable, similar to how we reordered the Dayweek variable) and facet by gender.
Which gender had the higher number of crashes? Is this what you expected?
# Put the months in calendar order. `month.name` is base R's built-in vector of
# English month names, January to December.
road1$Month <- factor(road1$Month, levels = month.name)
# Bars per month, coloured by gender. facet_wrap() adds one panel per gender,
# which is what the exercise asks for ("facet by gender"); fill alone only
# stacks the groups inside a single panel.
ggplot(road1, aes(x = Month)) +
  geom_bar(aes(fill = Gender)) +
  facet_wrap(~Gender) +
  # Rotate the month labels so they do not overlap
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Here we introduce the cool interactive tool called plotly. It is a separate package (loaded with library(plotly)) that works alongside ggplot2.
Work through the RGuide 5.7.
Now try some plots with the Road Fatality data.
library("plotly")
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive box plots of Age, coloured by Gender
p4 <- plot_ly(data = road1, x = ~Age, color = ~Gender, type = "box")
p4
## Warning: Ignoring 82 observations